Setup
## [1] "using data from data/responses_2025-05-05.csv"
Sample
Plot of Data Value Coding Challenges
## Free-text "other challenge" responses that were manually identified as being
## related to value coding; the rows are picked out by submission timestamp.
with(
  survey,
  challenge_other_describe[
    timestamp %in% c("4/17/2025 17:08:47", "4/30/2025 15:37:08")
  ]
)
## [1] "Determining whether empty cells indicated \"no\" versus indicated missing data was challenging. For example, medical history or other measures might have a 1 if a condition was endorsed, but have nothing if it was not endorsed. When that measure was then combined with others, empty cells could mean that participant wasn't included in the first dataaset (e.g., never completed medical history so should be missing data) or that they had a \"no\" for all of the items in the medical history."
## [2] "Variable reconciliation is always a challenge. I always have to be mindful of how variables are coded for analysis, particularly if those involve creating summary scores. We have noticed a few data errors, but the rate is very very low. Sometimes there is missing data and it is not clear why the data is missing. "
#### describe sample ####
# Were those who responded to the coding questions different from those who
# didn't? The summary columns are split by share_coding_preferences_check.
table1(
  ~ education + n_pheno_datasets + processing_proficiency +
    compiled_pheno_role_hands_on | share_coding_preferences_check,
  data = survey
)
| | No (N=27) | Yes (N=17) | Overall (N=44) |
|---|---|---|---|
| education | |||
| Bachelor's degree | 1 (3.7%) | 2 (11.8%) | 3 (6.8%) |
| Doctoral degree (e.g., PhD) | 19 (70.4%) | 11 (64.7%) | 30 (68.2%) |
| Masters's degree | 3 (11.1%) | 3 (17.6%) | 6 (13.6%) |
| MD, MPH, MA Ethics in Medicine | 1 (3.7%) | 0 (0%) | 1 (2.3%) |
| Professional degree (e.g., MD, JD, DDS) | 3 (11.1%) | 1 (5.9%) | 4 (9.1%) |
| n_pheno_datasets | |||
| 0 | 12 (44.4%) | 5 (29.4%) | 17 (38.6%) |
| 1-2 | 6 (22.2%) | 6 (35.3%) | 12 (27.3%) |
| 3-5 | 1 (3.7%) | 3 (17.6%) | 4 (9.1%) |
| 6-10 | 0 (0%) | 0 (0%) | 0 (0%) |
| 10+ | 8 (29.6%) | 3 (17.6%) | 11 (25.0%) |
| processing_proficiency | |||
| 1 | 1 (3.7%) | 0 (0%) | 1 (2.3%) |
| 2 | 3 (11.1%) | 1 (5.9%) | 4 (9.1%) |
| 3 | 5 (18.5%) | 3 (17.6%) | 8 (18.2%) |
| 4 | 9 (33.3%) | 5 (29.4%) | 14 (31.8%) |
| 5 | 7 (25.9%) | 7 (41.2%) | 14 (31.8%) |
| Missing | 2 (7.4%) | 1 (5.9%) | 3 (6.8%) |
| compiled_pheno_role_hands_on | |||
| no | 10 (37.0%) | 2 (11.8%) | 12 (27.3%) |
| yes | 17 (63.0%) | 15 (88.2%) | 32 (72.7%) |
# Overall (ungrouped) summary of education, current role and hands-on
# involvement for the rows in `coding`.
table1(
  ~ education + current_role + compiled_pheno_role_hands_on,
  data = coding
)
| | Overall (N=17) |
|---|---|
| education | |
| Bachelor's degree | 2 (11.8%) |
| Doctoral degree (e.g., PhD) | 11 (64.7%) |
| Masters's degree | 3 (17.6%) |
| Professional degree (e.g., MD, JD, DDS) | 1 (5.9%) |
| current_role | |
| Assistant Professor | 4 (23.5%) |
| Associate Professor | 1 (5.9%) |
| Doctoral Student (PhD, EdD, etc.) | 2 (11.8%) |
| Government Researcher | 1 (5.9%) |
| Master’s Student | 2 (11.8%) |
| Postdoctoral Researcher | 2 (11.8%) |
| Professor | 2 (11.8%) |
| Research Scientist | 1 (5.9%) |
| Simons Searchlight internal team | 1 (5.9%) |
| Missing | 1 (5.9%) |
| compiled_pheno_role_hands_on | |
| no | 2 (11.8%) |
| yes | 15 (88.2%) |
# project roles -- figure out a better way to display and summarize this
# One bar per column whose name contains "pheno_roles_", counting the rows
# that answered "yes"; bars are ordered from most to least selected.
# (The long-format name column is called `tool` although it holds role names.)
role_items <- names(coding)[grepl("pheno_roles_", names(coding))]
coding %>%
  select(all_of(role_items)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "tool",
    values_to = "selected"
  ) %>%
  filter(selected == "yes") %>%
  count(tool) %>%
  ggplot(mapping = aes(x = reorder(tool, -n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "Role with data",
    y = "Count",
    title = "Number of Participants Selecting Each Role"
  )
## software tools -- figure out a better way to display and summarize this
# Display labels for the x axis, keyed by the processing-tool column name.
tool_labels <- c(
  proc_tool_excel_automate = "Excel Adv",
  proc_tool_excel_basic = "Excel Basic",
  proc_tool_sas = "SAS",
  proc_tool_stata = "Stata",
  proc_tool_r = "R",
  proc_tool_spss = "SPSS",
  proc_tool_python = "Python",
  proc_tool_jupyter = "Jupyter Notebook"
)
# Every column whose name contains "proc_tool".
proc_tools_items <- names(coding)[grepl("proc_tool", names(coding))]
# One bar per tool, counting the rows that answered "yes"; bars are ordered
# from most to least selected and relabelled through `tool_labels`.
coding %>%
  select(all_of(proc_tools_items)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "tool",
    values_to = "selected"
  ) %>%
  filter(selected == "yes") %>%
  count(tool) %>%
  ggplot(mapping = aes(x = reorder(tool, -n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "Processing Tool",
    y = "Count",
    title = "Number of Participants Selecting Each Processing Tool"
  ) +
  scale_x_discrete(labels = tool_labels) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# count excel alone
# Rows that selected basic Excel and answered "no" for automated Excel, R,
# Python, SPSS, Stata and SAS.
# NOTE(review): proc_tool_jupyter is not part of this check -- confirm that
# is intended.
with(
  coding,
  sum(
    proc_tool_excel_basic == "yes" &
      proc_tool_excel_automate == "no" &
      proc_tool_r == "no" &
      proc_tool_python == "no" &
      proc_tool_spss == "no" &
      proc_tool_stata == "no" &
      proc_tool_sas == "no"
  )
)
## [1] 1
# count basic excel + SPSS only
# Rows that selected basic Excel and SPSS and answered "no" for automated
# Excel, R, Python, Stata and SAS.
with(
  coding,
  sum(
    proc_tool_excel_basic == "yes" &
      proc_tool_excel_automate == "no" &
      proc_tool_r == "no" &
      proc_tool_python == "no" &
      proc_tool_spss == "yes" &
      proc_tool_stata == "no" &
      proc_tool_sas == "no"
  )
)
## [1] 1
# count basic excel + stata only
# Rows that selected basic Excel and Stata and answered "no" for automated
# Excel, R, Python, SPSS and SAS.
with(
  coding,
  sum(
    proc_tool_excel_basic == "yes" &
      proc_tool_excel_automate == "no" &
      proc_tool_r == "no" &
      proc_tool_python == "no" &
      proc_tool_spss == "no" &
      proc_tool_stata == "yes" &
      proc_tool_sas == "no"
  )
)
## [1] 1
# count basic excel + programming
# Rows that selected basic Excel together with at least one of automated
# Excel, R, Python or SAS.
with(
  coding,
  sum(
    proc_tool_excel_basic == "yes" &
      (proc_tool_excel_automate == "yes" |
        proc_tool_r == "yes" |
        proc_tool_python == "yes" |
        proc_tool_sas == "yes")
  )
)
## [1] 6
# frequency of responses for n_pheno_datasets
barplot(
  height = table(coding[["n_pheno_datasets"]]),
  main = "Frequency: Number of pheno datasets pre-SFARI"
)
# frequency of responses for processing_proficiency
barplot(
  height = table(coding[["processing_proficiency"]]),
  main = "Frequency: How proficient are you (1-5)"
)
# Reduced copy of `coding`: keeps the columns whose name starts with "coding"
# or "proc_tool", or contains "role" or "processing_proficiency".
coding_red <- coding[
  grepl("^coding|role|processing_proficiency|^proc_tool", names(coding))
]
# Distribution of answers to the boolean value-code preference item.
barplot(
  height = table(coding[["coding_boolean"]]),
  main = "Preference for boolean value codes"
)
# Non-missing free-text explanations from respondents who answered 4 or 5.
with(
  coding,
  coding_boolean_describe[
    coding_boolean %in% c("4", "5") & !is.na(coding_boolean_describe)
  ]
)
## [1] "integer codes will always require referring to a different document/dictionary. I would prefer to enter data as labels and recode as needed. I also have an issues with students using integer codes as if they are ordinal or continuous when they really are discrete values. "
## [2] "easier for me to recode as needed, rather than continually going back to remember what was what"
## [3] "using numbers creates the possibility for confusion by naive users as they may assume it's actually a number rather than a category. also, reduces workload by eliminating the need to assign labels. "
## [4] "Easy to understand."
## [5] "Using string or Boolean is a lot more clear because there are many other numeric variables"
# Non-missing free-text explanations from respondents who answered 1 or 2.
with(
  coding,
  coding_boolean_describe[
    coding_boolean %in% c("1", "2") & !is.na(coding_boolean_describe)
  ]
)
## [1] "For me this makes the variables easier to use in analyses, and I can always consult the data dictionary to verify that the integers mean what I think they would mean."
## [2] "functionally equivalent, but takes less space"
# Non-missing free-text explanations from respondents who answered 3.
# `%in%` is used instead of `==` so that a missing rating evaluates to FALSE
# rather than NA; an NA in a logical index would add a spurious NA element to
# the result.
coding$coding_boolean_describe[
  coding$coding_boolean %in% "3" & !is.na(coding$coding_boolean_describe)
]
## character(0)
# Distribution of answers to the nominal-categorical preference item
# (title typo "nomical" corrected to "nominal").
barplot(
  table(coding$coding_nominal),
  main = "Preference for nominal categorical"
)
# Non-missing free-text explanations from respondents who answered 4 or 5.
coding$coding_nominal_describe[
  coding$coding_nominal %in% c("4", "5") &
    !is.na(coding$coding_nominal_describe)
]
## [1] "Same as above"
## [2] "easier for me to recode as needed, rather than continually going back to remember what was what"
## [3] "same as above "
## [4] "Easier for visual inspection and less room for error in converting from numbers to categories."
## [5] "I mostly use these data for building machine learning models. If the data is inherently unordered, I would like to process them differently, rather than implicitly assigning some order due to integer coding."
## [6] "For the non-ordered categorical data, it might be easier to have access to strings. This would especially be the case if there are many values (e.g., 14) because I could more easily create numerical coding. "
## [7] "This type of categorical variables with short responses is always better when it is strings. Strings are intuitive to visualize and understand. "
# Non-missing free-text explanations from respondents who answered 1 or 2.
with(
  coding,
  coding_nominal_describe[
    coding_nominal %in% c("1", "2") & !is.na(coding_nominal_describe)
  ]
)
## [1] "R generally becomes unwell when too many spaces in options. "
## [2] "essential keys are clear and consistent, but numerical preference for storage"
## [3] "More suitable for codes programming."
# Non-missing free-text explanations from respondents who answered 3.
# `%in%` is used instead of `==` so that a missing rating evaluates to FALSE
# rather than NA; an NA in a logical index would add a spurious NA element to
# the result.
coding$coding_nominal_describe[
  coding$coding_nominal %in% "3" & !is.na(coding$coding_nominal_describe)
]
## character(0)
# Distribution of answers to the ordinal-categorical preference item.
barplot(
  height = table(coding[["coding_ordinal"]]),
  main = "Preference for ordinal categorical"
)
# Non-missing free-text explanations from respondents who answered 4 or 5.
with(
  coding,
  coding_ordinal_describe[
    coding_ordinal %in% c("4", "5") & !is.na(coding_ordinal_describe)
  ]
)
## [1] "a problem here is that the levels of the strings will be alphanumeric, so would prefer strings that would maintain the ordinal nature of the data, like \"0.lessthan6m\", \"1.6to12m\", etc.. "
## [2] "easier for me to recode as needed, rather than continually going back to remember what was what"
## [3] "same as above"
## [4] "Same as above"
## [5] "Again, short responses are better when they are strings."
# Non-missing free-text explanations from respondents who answered 1 or 2.
with(
  coding,
  coding_ordinal_describe[
    coding_ordinal %in% c("1", "2") & !is.na(coding_ordinal_describe)
  ]
)
## [1] "As above"
## [2] "More suitable for codes programming."
# Non-missing free-text explanations from respondents who answered 3.
# `%in%` is used instead of `==` so that a missing rating evaluates to FALSE
# rather than NA; an NA in a logical index would add a spurious NA element to
# the result.
coding$coding_ordinal_describe[
  coding$coding_ordinal %in% "3" & !is.na(coding$coding_ordinal_describe)
]
## character(0)
# Distribution of answers to the special-cases (coding_dk_pna) preference item.
barplot(
  height = table(coding[["coding_dk_pna"]]),
  main = "Preference for special cases"
)
# Non-missing free-text explanations from respondents who answered 4 or 5.
with(
  coding,
  coding_dk_pna_describe[
    coding_dk_pna %in% c("4", "5") & !is.na(coding_dk_pna_describe)
  ]
)
## [1] "easier for me to recode as needed, rather than continually going back to remember what was what"
## [2] "my preference is because naive users don't do data checks and return a mean value for age of diagnosis of 437 years. help people help themselves! :-)"
## [3] "numerical out of range coding can be very dangerous, extremely susceptible to misinterpretation and best avoided"
## [4] "If dontknow is used, then when I reformat the column into numeric, that would turn into NAs directly and will not subsequently impact the calculation of mean, SDs, correlations, etc. "
# Non-missing free-text explanations from respondents who answered 1 or 2.
with(
  coding,
  coding_dk_pna_describe[
    coding_dk_pna %in% c("1", "2") & !is.na(coding_dk_pna_describe)
  ]
)
## [1] "numeric variables should only contain numerals. The problem is when you do not know all the possible out of range options, you sometimes overlook and then a random large number gets input. I feel like this was a problem with SSC ADI -- where there were codes like 789, 788, 888, 999 (making up numbers because I don't remember offhand) that meant different things -- like missing at random, unknown, no answer given, unable to asses due to age, etc."
## [2] "More suitable for codes programming."
# Non-missing free-text explanations from respondents who answered 3.
# `%in%` is used instead of `==` so that a missing rating evaluates to FALSE
# rather than NA; an NA in a logical index would add a spurious NA element to
# the result.
coding$coding_dk_pna_describe[
  coding$coding_dk_pna %in% "3" & !is.na(coding$coding_dk_pna_describe)
]
## [1] "Pros and cons. If 999, application might inadvertently think entire column is numerical and impose an age of 999 for the non-attentive researcher. If dont_know, then might make the rest of the column harder to analyze since distorting the data type. "
## [2] "I think clearly out of range scores for numerical values are useful. Typically I import data into R and have code that indicates what is to be read in as NA, so it might not matter necessarily and so I selected 3."
# Distribution of answers to the skipped-question preference item.
barplot(
  height = table(coding[["coding_skipped"]]),
  main = "Preference for skipped"
)
# Non-missing free-text explanations from respondents who answered 4 or 5.
with(
  coding,
  coding_skipped_describe[
    coding_skipped %in% c("4", "5") & !is.na(coding_skipped_describe)
  ]
)
## [1] "it saves data wrangling time if conditional missing is hard coded. otherwise i have to process the data (i.e., is q1 = no?) to distinguish between valid null and invalid null. "
## [2] "I very much appreciate the skipped logic code that is different from a missing code rather than having a blank cell. "
# Non-missing free-text explanations from respondents who answered 1 or 2.
with(
  coding,
  coding_skipped_describe[
    coding_skipped %in% c("1", "2") & !is.na(coding_skipped_describe)
  ]
)
## [1] "I do not have any personal need to distinguish whether it was shown or now"
## [2] "As above, best avoid out of range numerical coding"
## [3] "\nEasy to understand."
# Non-missing free-text explanations from respondents who answered 3.
# `%in%` is used instead of `==` so that a missing rating evaluates to FALSE
# rather than NA; an NA in a logical index would add a spurious NA element to
# the result.
coding$coding_skipped_describe[
  coding$coding_skipped %in% "3" & !is.na(coding$coding_skipped_describe)
]
## [1] "no preference"
# Distribution of answers to the checklist-coding preference item.
barplot(table(coding$coding_checklists), main = "Preference for checklists")
# Non-missing free-text explanations from respondents who chose the
# "0 or null depending on context" option.
# The filter is on the chosen option (coding_checklists); comparing the
# free-text column itself with the option label, as before, could only match a
# comment that was literally the option text. `%in%` keeps missing choices
# from producing NA entries.
# NOTE(review): assumes coding_checklists stores the option text verbatim --
# confirm against table(coding$coding_checklists).
coding$coding_checklists_describe[
  coding$coding_checklists %in%
    "checked = 1; unchecked = 0 or null depending on context" &
    !is.na(coding$coding_checklists_describe)
]
## character(0)
# Non-missing free-text explanations from respondents who chose the
# "checked = 1; unchecked = 0" option.
# The filter is on the chosen option (coding_checklists); comparing the
# free-text column itself with the option label, as before, could only match a
# comment that was literally the option text. `%in%` keeps missing choices
# from producing NA entries.
# NOTE(review): assumes coding_checklists stores the option text verbatim --
# confirm against table(coding$coding_checklists).
coding$coding_checklists_describe[
  coding$coding_checklists %in% "checked = 1; unchecked = 0" &
    !is.na(coding$coding_checklists_describe)
]
## character(0)
# Non-missing free-text explanations from respondents who chose the
# "checked = 1; unchecked = null" option.
# The filter is on the chosen option (coding_checklists); comparing the
# free-text column itself with the option label, as before, could only match a
# comment that was literally the option text. `%in%` keeps missing choices
# from producing NA entries.
# NOTE(review): assumes coding_checklists stores the option text verbatim --
# confirm against table(coding$coding_checklists).
coding$coding_checklists_describe[
  coding$coding_checklists %in% "checked = 1; unchecked = null" &
    !is.na(coding$coding_checklists_describe)
]
## character(0)